1 Load Source File

Data pre-processing is included, where special chars and minimal stop-words are removed

source("~/Dropbox/Eugenie/scripts/utils.R")

Load additional libraries

## for qdap
library(rJava)
library(qdapRegex)
library(qdapDictionaries)
library(qdapTools)
library(qdap)

## for syuzhet
library(syuzhet)

Get relevant columns

## keep only the identifier columns and the review text
cols <- c('recid', 'item_id', 'user_id', 'text')
reviews2.text <- as.data.frame(reviews2.csv[, cols])

2 Sentiment Analysis by Sentence Using ‘qdap’ Package

2.1 Further Data Pre-Processing with qdap Methods

Clean text further with qdap methods

## text cleaning
## report potential problems in the raw text
check_text(reviews2.text$text)
## apply the qdap cleaners in sequence (innermost call runs first):
## contractions -> numbers -> missing endmarks -> comma spacing
reviews2.text$text <- add_comma_space(
  add_missing_endmark(
    replace_number(
      replace_contraction(reviews2.text$text)
    )
  )
)

Split each review into sentences. Note that this line took an hour to run on Eugenie’s laptop.

## sentence splitting on the 'text' column; takes about 1 hour
reviews2.text.qdap <- sentSplit(dataframe = reviews2.text, text.var = "text")

2.2 Compute Polarity

Compute polarity scores at the sentence level for each review, and transform the result from a list to a data frame.

## Polarity of the sentence-split text, grouped by review id (recid).
## This particular line takes 3 hours to run.
pol.reviews2.text <- with(reviews2.text.qdap, polarity(text.var = text, grouping.var = recid))

## Transform the data structure for analysis: take the scores of the polarity
## result and pass them through colsplit2df() to get a data frame.
## NOTE(review): the joined table later shows total.sentences, total.words,
## ave.polarity, sd.polarity and stan.mean.polarity per recid -- presumably
## these are the columns produced here; confirm against the qdap output.
qdap.reviews2.text <- colsplit2df(scores(pol.reviews2.text))

Join the scores with more features from the raw data for further analysis

## Bring the join key to numeric on both sides before merging.
## Always go through as.character() first: if recid is stored as a factor,
## as.numeric() alone returns the internal level codes rather than the ids,
## which would silently join the scores to the wrong reviews.
qdap.reviews2.text$recid <- as.numeric(as.character(qdap.reviews2.text$recid))
reviews2.csv$recid <- as.numeric(as.character(reviews2.csv$recid))

## Attach the polarity scores to the selected raw review features by recid.
qdap.reviews2 <- merge(
  reviews2.csv[, c('recid', 'rating', 'text', 'incentivized', 'is_deleted', 'verified_purchaser')],
  qdap.reviews2.text,
  by = 'recid'
)

The script above computes the sentiment score at the sentence level for each review. We’re not executing it on the fly here because it takes roughly 5 hours in total to generate the result.

Load the pre-computed, joined sentiment score. This is what we would get from executing the script above

## load the pre-computed qdap sentiment result joined with the raw features
qdap.reviews2 <- read.csv(file = '~/Dropbox/Eugenie/data/processed/qdap-reviews2.csv')

2.3 Analysis

2.3.1 Summary stats checks

## mean ave.polarity per incentivized group, ignoring NA scores
qdap.reviews2 %>%
  dplyr::select(incentivized, ave.polarity) %>%
  group_by(incentivized) %>%
  summarize_all(mean, na.rm = TRUE)
## # A tibble: 2 x 2
##   incentivized     ave.polarity
##   <fct>                   <dbl>
## 1 incentivized            0.277
## 2 non-incentivized        0.406

This result is different from the one we saw using the ‘sentimentr’ package

## collect the records whose ave.polarity is NA ...
qdap.nas <- qdap.reviews2[is.na(qdap.reviews2$ave.polarity), ]
## ... and render them as a table
knitr::kable(
  qdap.nas,
  caption = 'All reviews with NA ave.polarity score',
  floating.environment="sidewaystable"
)
All reviews with NA ave.polarity score
recid rating text incentivized is_deleted verified_purchaser total.sentences total.words ave.polarity sd.polarity stan.mean.polarity
2087 16848826 5 non-incentivized kept verified 1 0 NA NA NA
4746 23150116 5 100% non-incentivized kept verified 1 0 NA NA NA
6594 23201012 5 100 non-incentivized kept verified 1 0 NA NA NA
8643 23996077 3 non-incentivized kept verified 1 0 NA NA NA
21857 30801468 5 10/10 non-incentivized kept verified 1 0 NA NA NA
30596 51835744 5 non-incentivized kept verified 1 0 NA NA NA
42318 74763670 4 non-incentivized kept verified 1 0 NA NA NA
44552 74770498 5 11 non-incentivized kept verified 1 0 NA NA NA
49557 81902483 1 non-incentivized kept verified 1 0 NA NA NA
51228 81911565 1 non-incentivized kept verified 1 0 NA NA NA
53309 84686504 2 non-incentivized kept verified 1 0 NA NA NA
54465 85581501 5 non-incentivized kept verified 1 0 NA NA NA
64531 94859873 5 non-incentivized deleted unverified 1 0 NA NA NA
70497 106271781 5 non-incentivized kept verified 1 0 NA NA NA
76132 110745574 4 non-incentivized kept verified 1 0 NA NA NA
83608 131453099 5 100% non-incentivized kept verified 1 0 NA NA NA
92087 164110427 5 8/10 non-incentivized kept verified 1 0 NA NA NA
108521 189686285 1 non-incentivized kept verified 1 0 NA NA NA
118391 249008323 5 non-incentivized kept verified 1 0 NA NA NA
124412 315873762 5 non-incentivized deleted verified 1 0 NA NA NA
124531 317001064 5 non-incentivized kept verified 1 0 NA NA NA
124568 317001194 5 non-incentivized deleted verified 1 0 NA NA NA
130525 353085494 1 non-incentivized kept verified 1 0 NA NA NA
139718 382134895 5 10/10 non-incentivized kept verified 1 0 NA NA NA
142926 385507230 5 10/10 non-incentivized deleted verified 1 0 NA NA NA
144768 391324730 4 non-incentivized kept verified 1 0 NA NA NA
144979 391724796 5 non-incentivized kept verified 1 0 NA NA NA
156411 411667863 5 non-incentivized kept verified 1 0 NA NA NA
156934 414143097 5 <3 non-incentivized kept verified 1 0 NA NA NA
164624 424141828 5 non-incentivized kept unverified 1 0 NA NA NA
177072 434539930 5 10/10 non-incentivized kept verified 1 0 NA NA NA
180298 434961431 5 10/10 non-incentivized deleted verified 1 0 NA NA NA
183032 439752882 5 10/10 non-incentivized kept verified 1 0 NA NA NA
183211 439753061 1 non-incentivized kept verified 1 0 NA NA NA
184384 439754748 5 non-incentivized kept verified 1 0 NA NA NA
191520 452192856 5 non-incentivized kept verified 1 0 NA NA NA
191834 452193735 5 non-incentivized kept unverified 1 0 NA NA NA
200167 462442213 5 non-incentivized kept verified 1 0 NA NA NA
201569 462445788 5 non-incentivized kept verified 1 0 NA NA NA
202813 462818420 5 non-incentivized kept verified 1 0 NA NA NA
204158 463012977 2 non-incentivized kept verified 1 0 NA NA NA
205479 464876931 4 9/10 non-incentivized kept verified 1 0 NA NA NA
213132 474567469 5 non-incentivized kept verified 1 0 NA NA NA
216996 475698866 5 5/5 non-incentivized kept verified 1 0 NA NA NA
219912 479412460 5 non-incentivized kept verified 1 0 NA NA NA
221172 483235704 5 non-incentivized kept verified 1 0 NA NA NA
221330 483236894 5 non-incentivized kept verified 1 0 NA NA NA
221789 483239957 5 non-incentivized kept verified 1 0 NA NA NA
227709 489373936 3 non-incentivized kept verified 1 0 NA NA NA
227775 489374364 5 non-incentivized kept verified 1 0 NA NA NA
230355 496972834 5 10/10 non-incentivized kept verified 1 0 NA NA NA
231391 496991042 5 10/10 non-incentivized kept verified 1 0 NA NA NA
232654 497936254 5 non-incentivized kept verified 1 0 NA NA NA
233054 499073153 5 5/5 non-incentivized kept verified 1 0 NA NA NA
234626 500156831 5 non-incentivized kept verified 1 0 NA NA NA
238727 505518777 5 non-incentivized kept verified 1 0 NA NA NA
239232 505920173 1 non-incentivized kept verified 1 0 NA NA NA
239360 505921449 5 non-incentivized kept verified 1 0 NA NA NA
239750 506133439 1 non-incentivized kept verified 1 0 NA NA NA
240504 506153997 2 non-incentivized kept verified 1 0 NA NA NA
242307 507134614 5 10/10! non-incentivized kept verified 1 0 NA NA NA
242868 507137938 5 non-incentivized kept verified 1 0 NA NA NA
248267 507949725 5 non-incentivized kept verified 1 0 NA NA NA
248640 507950207 5 non-incentivized kept verified 1 0 NA NA NA
253057 517250976 5 non-incentivized kept verified 1 0 NA NA NA
253708 517745389 5 non-incentivized kept verified 1 0 NA NA NA
255836 521918173 5 non-incentivized kept verified 1 0 NA NA NA
258233 524910051 5 non-incentivized kept verified 1 0 NA NA NA

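There are some NA values for edge cases: as the table shows, these are reviews whose text is empty or consists only of digits and symbols (e.g. “100%”, “10/10”, “<3”), so total.words is 0 and no polarity score is produced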

2.3.2 Plots

Boxplot: rating vs. ave.polarity

## Warning: Ignoring 68 observations

2.3.3 Fixed Effect Linear Model

Join with selected columns from the raw data

## add item_id from the raw data (natural join on the shared columns)
qdap.reviews2 <- merge(x = qdap.reviews2, y = reviews2.csv[, c('recid', 'item_id')])

## fixed effect linear model:
## use the sentence sentiment score in place of rating as the response
formula.fe <- ave.polarity ~ incentivized + is_deleted + verified_purchaser
model.fe <- plm(
  formula = formula.fe,
  data = qdap.reviews2,
  model = 'within',
  index = c('item_id')
)
## print the model summary
summary(model.fe)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = formula.fe, data = qdap.reviews2, model = "within", 
##     index = c("item_id"))
## 
## Unbalanced Panel: n = 101, T = 29-10133, N = 263948
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -2.762940 -0.300630 -0.058203  0.237060  4.102307 
## 
## Coefficients:
##                                Estimate Std. Error t-value  Pr(>|t|)    
## incentivizednon-incentivized  0.0746480  0.0113962  6.5502 5.755e-11 ***
## is_deletedkept               -0.0282067  0.0037962 -7.4303 1.087e-13 ***
## verified_purchaserverified    0.0397443  0.0038586 10.3002 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    54423
## Residual Sum of Squares: 54379
## R-Squared:      0.00080966
## Adj. R-Squared: 0.00041959
## F-statistic: 71.2656 on 3 and 263844 DF, p-value: < 2.22e-16

2.3.4 Correlation: rating vs. review sentiment

## Pearson correlation between rating and average polarity.
## The method is named explicitly: passing the whole vector of choices does not
## run all three tests, it silently selects the first entry ("pearson").
cor.test(qdap.reviews2$rating, qdap.reviews2$ave.polarity, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  qdap.reviews2$rating and qdap.reviews2$ave.polarity
## t = 280.03, df = 263946, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4756451 0.4815274
## sample estimates:
##       cor 
## 0.4785916

3 Sentiment Analysis by Sentence Using ‘syuzhet’ Package

3.1 Compute Polarity

The script below runs a lot faster than the methods from the ‘qdap’ package (as stated in the ‘sentimentr’ description and as seen in this particular example). It took Eugenie’s laptop around 15 minutes in total to run all four options

## Compute a sentiment score per review with each of the four lexicons
## supported by syuzhet::get_sentiment().
##
## 'rating' is not one of the columns kept in reviews2.text (recid, item_id,
## user_id, text), so recid and rating are taken from reviews2.csv. The text
## comes from reviews2.text, which was built as a column subset of
## reviews2.csv and therefore has the same rows in the same order.
##
## Inside mutate() the text is referenced as the column `text`:
## syuzhet.reviews2 does not exist yet while the right-hand side of this
## assignment is being evaluated, so syuzhet.reviews2$text cannot be used.
syuzhet.reviews2 <- reviews2.csv[, c('recid', 'rating')] %>%
  mutate(text = as.character(reviews2.text$text)) %>%
  mutate(syuzhet.sentiment = syuzhet::get_sentiment(text, method = 'syuzhet')) %>%
  mutate(afinn.sentiment = syuzhet::get_sentiment(text, method = 'afinn')) %>%
  mutate(nrc.sentiment = syuzhet::get_sentiment(text, method = 'nrc')) %>%
  mutate(bing.sentiment = syuzhet::get_sentiment(text, method = 'bing'))

For the sake of time, we’re loading the pre-computed results

## load the pre-computed syuzhet sentiment scores
syuzhet.reviews2 <- read.csv(file = '~/Dropbox/Eugenie/data/processed/syuzhet-reviews2.csv')

3.2 Analysis

Here we’re showing a brief analysis for all four options

### Summary stats

Since the ‘syuzhet’ package supports four lexicons, we’re showing the summary stats for all four here

## mean syuzhet-lexicon score per incentivized group
syuzhet.reviews2 %>%
  dplyr::select(incentivized, syuzhet.sentiment) %>%
  group_by(incentivized) %>%
  summarize_all(mean, na.rm = TRUE)
## # A tibble: 2 x 2
##   incentivized     syuzhet.sentiment
##   <fct>                        <dbl>
## 1 incentivized                  6.83
## 2 non-incentivized              1.23
## mean afinn-lexicon score per incentivized group
syuzhet.reviews2 %>%
  dplyr::select(incentivized, afinn.sentiment) %>%
  group_by(incentivized) %>%
  summarize_all(mean, na.rm = TRUE)
## # A tibble: 2 x 2
##   incentivized     afinn.sentiment
##   <fct>                      <dbl>
## 1 incentivized               12.5 
## 2 non-incentivized            2.78
## mean nrc-lexicon score per incentivized group
syuzhet.reviews2 %>%
  dplyr::select(incentivized, nrc.sentiment) %>%
  group_by(incentivized) %>%
  summarize_all(mean, na.rm = TRUE)
## # A tibble: 2 x 2
##   incentivized     nrc.sentiment
##   <fct>                    <dbl>
## 1 incentivized             6.68 
## 2 non-incentivized         0.802
## mean bing-lexicon score per incentivized group
syuzhet.reviews2 %>%
  dplyr::select(incentivized, bing.sentiment) %>%
  group_by(incentivized) %>%
  summarize_all(mean, na.rm = TRUE)
## # A tibble: 2 x 2
##   incentivized     bing.sentiment
##   <fct>                     <dbl>
## 1 incentivized               7.16
## 2 non-incentivized           1.60

All four lexicon options returned the same observation: the incentivized reviews have higher sentiment scores on average than the non-incentivized reviews

This result is the opposite of what we saw earlier from the ‘qdap’ package, where the incentivized reviews had the lower average polarity (0.277 vs. 0.406)

There are no NA values generated by these methods

3.2.1 Plots

Boxplot: rating vs. syuzhet.sentiment

Boxplot: rating vs. afinn.sentiment

Boxplot: rating vs. nrc.sentiment

Boxplot: rating vs. bing.sentiment

3.2.2 Fixed Effect Linear Model

Join with selected columns from the raw data

## add item_id from the raw data (natural join on the shared columns)
syuzhet.reviews2 <- merge(x = syuzhet.reviews2, y = reviews2.csv[, c('recid', 'item_id')])

Here we’re showing the linear model for all four options

## fixed effect linear model (syuzhet lexicon):
## use the sentiment score in place of rating as the response
formula.fe <- syuzhet.sentiment ~ incentivized + is_deleted + verified_purchaser
model.fe <- plm(
  formula = formula.fe,
  data = syuzhet.reviews2,
  model = 'within',
  index = c('item_id')
)
## print the model summary
summary(model.fe)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = formula.fe, data = syuzhet.reviews2, model = "within", 
##     index = c("item_id"))
## 
## Unbalanced Panel: n = 101, T = 29-10134, N = 264016
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -10.05502  -0.77039  -0.19341   0.58073  28.34164 
## 
## Coefficients:
##                               Estimate Std. Error  t-value  Pr(>|t|)    
## incentivizednon-incentivized -4.894152   0.035167 -139.169 < 2.2e-16 ***
## is_deletedkept               -0.230489   0.011713  -19.678 < 2.2e-16 ***
## verified_purchaserverified   -0.403659   0.011906  -33.904 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    574120
## Residual Sum of Squares: 517970
## R-Squared:      0.097792
## Adj. R-Squared: 0.09744
## F-statistic: 9535.34 on 3 and 263912 DF, p-value: < 2.22e-16
## fixed effect linear model (afinn lexicon):
## use the sentiment score in place of rating as the response
formula.fe <- afinn.sentiment ~ incentivized + is_deleted + verified_purchaser
model.fe <- plm(
  formula = formula.fe,
  data = syuzhet.reviews2,
  model = 'within',
  index = c('item_id')
)
## print the model summary
summary(model.fe)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = formula.fe, data = syuzhet.reviews2, model = "within", 
##     index = c("item_id"))
## 
## Unbalanced Panel: n = 101, T = 29-10134, N = 264016
## 
## Residuals:
##       Min.    1st Qu.     Median    3rd Qu.       Max. 
## -30.112833  -2.315512  -0.037194   1.701766  52.469213 
## 
## Coefficients:
##                               Estimate Std. Error t-value  Pr(>|t|)    
## incentivizednon-incentivized -8.441802   0.089407 -94.419 < 2.2e-16 ***
## is_deletedkept               -0.511688   0.029778 -17.183 < 2.2e-16 ***
## verified_purchaserverified   -0.686430   0.030270 -22.677 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    3519300
## Residual Sum of Squares: 3348000
## R-Squared:      0.048664
## Adj. R-Squared: 0.048293
## F-statistic: 4500.04 on 3 and 263912 DF, p-value: < 2.22e-16
## fixed effect linear model (nrc lexicon):
## use the sentiment score in place of rating as the response
formula.fe <- nrc.sentiment ~ incentivized + is_deleted + verified_purchaser
model.fe <- plm(
  formula = formula.fe,
  data = syuzhet.reviews2,
  model = 'within',
  index = c('item_id')
)
## print the model summary
summary(model.fe)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = formula.fe, data = syuzhet.reviews2, model = "within", 
##     index = c("item_id"))
## 
## Unbalanced Panel: n = 101, T = 29-10134, N = 264016
## 
## Residuals:
##       Min.    1st Qu.     Median    3rd Qu.       Max. 
## -11.143271  -0.925852  -0.080661   0.587376  32.544075 
## 
## Coefficients:
##                               Estimate Std. Error  t-value  Pr(>|t|)    
## incentivizednon-incentivized -5.320651   0.040027 -132.928 < 2.2e-16 ***
## is_deletedkept               -0.195991   0.013331  -14.701 < 2.2e-16 ***
## verified_purchaserverified   -0.356488   0.013551  -26.306 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    733780
## Residual Sum of Squares: 671020
## R-Squared:      0.085524
## Adj. R-Squared: 0.085167
## F-statistic: 8227.26 on 3 and 263912 DF, p-value: < 2.22e-16
## fixed effect linear model (bing lexicon):
## use the sentiment score in place of rating as the response
formula.fe <- bing.sentiment ~ incentivized + is_deleted + verified_purchaser
model.fe <- plm(
  formula = formula.fe,
  data = syuzhet.reviews2,
  model = 'within',
  index = c('item_id')
)
## print the model summary
summary(model.fe)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = formula.fe, data = syuzhet.reviews2, model = "within", 
##     index = c("item_id"))
## 
## Unbalanced Panel: n = 101, T = 29-10134, N = 264016
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -12.40467  -1.01461  -0.15270   0.87808  29.59394 
## 
## Coefficients:
##                               Estimate Std. Error  t-value  Pr(>|t|)    
## incentivizednon-incentivized -4.904944   0.046670 -105.097 < 2.2e-16 ***
## is_deletedkept               -0.299480   0.015544  -19.266 < 2.2e-16 ***
## verified_purchaserverified   -0.377601   0.015801  -23.898 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    969570
## Residual Sum of Squares: 912270
## R-Squared:      0.0591
## Adj. R-Squared: 0.058733
## F-statistic: 5525.68 on 3 and 263912 DF, p-value: < 2.22e-16

3.2.3 Correlation: rating vs. review sentiment

## Pearson correlation between rating and the syuzhet-lexicon score.
## The method is named explicitly: passing the whole vector of choices does not
## run all three tests, it silently selects the first entry ("pearson").
cor.test(syuzhet.reviews2$rating, syuzhet.reviews2$syuzhet.sentiment, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  syuzhet.reviews2$rating and syuzhet.reviews2$syuzhet.sentiment
## t = 143.17, df = 264014, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2648695 0.2719488
## sample estimates:
##       cor 
## 0.2684128
## Pearson correlation between rating and the afinn-lexicon score.
## The method is named explicitly: passing the whole vector of choices does not
## run all three tests, it silently selects the first entry ("pearson").
cor.test(syuzhet.reviews2$rating, syuzhet.reviews2$afinn.sentiment, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  syuzhet.reviews2$rating and syuzhet.reviews2$afinn.sentiment
## t = 183.08, df = 264014, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3322494 0.3390189
## sample estimates:
##       cor 
## 0.3356385
## Pearson correlation between rating and the nrc-lexicon score.
## The method is named explicitly: passing the whole vector of choices does not
## run all three tests, it silently selects the first entry ("pearson").
cor.test(syuzhet.reviews2$rating, syuzhet.reviews2$nrc.sentiment, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  syuzhet.reviews2$rating and syuzhet.reviews2$nrc.sentiment
## t = 90.225, df = 264014, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1692463 0.1766471
## sample estimates:
##       cor 
## 0.1729491
## Pearson correlation between rating and the bing-lexicon score.
## The method is named explicitly: passing the whole vector of choices does not
## run all three tests, it silently selects the first entry ("pearson").
cor.test(syuzhet.reviews2$rating, syuzhet.reviews2$bing.sentiment, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  syuzhet.reviews2$rating and syuzhet.reviews2$bing.sentiment
## t = 196.88, df = 264014, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3544634 0.3611157
## sample estimates:
##       cor 
## 0.3577941